###############################################################################
#   Copyright : SyN Lab, University of Texas at San Antonio.                  #
#   Authors   : Prof.Palden Lama, Mr.Kumar Thummapudi, Prof.Rajendra Boppana. #
#   Usage     : python3 prepare_data.py <HPC_folder> <IO_folder> <output_file>#
#   Note: The results may slightly vary based on the versions of the packages.#
###############################################################################
import os, re, sys
import pandas as pd
import numpy as np
import os.path
from scipy.stats import iqr

def readCSVs(path, mode):
    """Load every CSV found in *path* into one labelled DataFrame.

    Each file name is split on '_' and the labels are taken by position:
    field 1 is the load, field 3 the experiment round, field 4 the
    application name, and the first character of field 5 the integer class.

    Files are visited in sorted name order so that the row order of the
    result is reproducible (os.listdir() returns entries in arbitrary order).

    Args:
        path: Directory containing the CSV files.
        mode: 'HPC' keeps the hardware-counter columns and the first 12000
            rows of each file; any other value keeps the block-I/O columns
            and the first 3000 rows.

    Returns:
        A DataFrame whose columns are SampleLoad, Round, SampleClass,
        SampleApp followed by the selected metric columns, indexed 0..n-1.
        An empty DataFrame is returned when the directory holds no CSV file.

    Raises:
        SystemExit: If a file's columns differ from those of the first file.
    """
    if mode == 'HPC':
        row_limit = 12000  # 1 min data
        metric_cols = ['L1-icache-load-misses', 'LLC-stores', 'branch-load-misses',
                       'instructions', 'node-load-misses']
    else:
        row_limit = 3000  # 1 min data
        metric_cols = ['rd_req', 'rd_bytes', 'wr_req', 'wr_bytes', 'flush_operations',
                       'rd_total_times', 'wr_total_times', 'flush_total_times']

    frames = []
    # Paths are joined explicitly instead of chdir-ing into the directory, so
    # the process working directory is never left modified if a read fails.
    for file in sorted(os.listdir(path)):
        # Only names ending in "csv" are treated as data files.
        if not re.search("^.*csv$", file):
            continue

        fields = file.split('_')
        exp_round = fields[3]
        app_name = fields[4]
        app_class = int(fields[5][0])
        app_load = fields[1]

        df = pd.read_csv(os.path.join(path, file))
        df = df[metric_cols].iloc[:row_limit].copy()
        # Successive insert(0, ...) calls leave the labels in the order
        # SampleLoad, Round, SampleClass, SampleApp.
        df.insert(0, 'SampleApp', app_name)
        df.insert(0, 'SampleClass', app_class)
        df.insert(0, 'Round', exp_round)
        df.insert(0, 'SampleLoad', app_load)

        if frames and set(df.columns) != set(frames[0].columns):
            print('The current file: ', file, ' events doesn\'t match with previous file events.')
            sys.exit(1)
        frames.append(df)

    # A single concat avoids re-copying the accumulated data for every file.
    dataset = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    print(dataset.shape)
    return dataset

def aggregate_metrics(df, instance_size, mode):
    """Collapse every *instance_size* consecutive samples into one row.

    Rows are bucketed by ``df.index // instance_size``, so *df* is expected
    to carry a 0..n-1 integer index. For each bucket the mean, median,
    standard deviation (pandas default) and interquartile range
    (scipy.stats.iqr) of every metric column are computed, and each label
    column is reduced to its most frequent value.

    Args:
        df: Frame holding the metric columns for *mode* plus the label
            columns SampleApp, SampleClass, Round and SampleLoad.
        instance_size: Number of consecutive index values per bucket.
        mode: 'HPC' selects the hardware-counter columns; any other value
            selects the block-I/O columns.

    Returns:
        A DataFrame with one row per bucket: '<metric>_mean' columns, then
        '<metric>_median', '<metric>_std', '<metric>_iqr', then the labels.
    """
    if mode == 'HPC':
        feature_cols = ['L1-icache-load-misses', 'LLC-stores', 'branch-load-misses',
                        'instructions', 'node-load-misses']
    else:
        feature_cols = ['rd_req', 'rd_bytes', 'wr_req', 'wr_bytes', 'flush_operations',
                        'rd_total_times', 'wr_total_times', 'flush_total_times']
    label_cols = ['SampleApp', 'SampleClass', 'Round', 'SampleLoad']

    # Build the bucket key once and reuse it for every statistic.
    bucket = df.index // instance_size
    grouped = df[feature_cols].groupby(bucket)

    df_mean = grouped.mean().add_suffix('_mean')
    df_median = grouped.median().add_suffix('_median')
    df_std = grouped.std().add_suffix('_std')
    df_iqr = grouped.agg(iqr).add_suffix('_iqr')

    def _first_mode(values):
        # Series.mode() returns every tied value (sorted). Always reduce to a
        # single scalar so a bucket that mixes labels equally often does not
        # end up with an array in its cell.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    df_labels = df[label_cols].groupby(bucket).agg(_first_mode)
    df = pd.concat([df_mean, df_median, df_std, df_iqr, df_labels], axis=1)

    print(df.shape)
    return df

if __name__ == "__main__":
    # Command-line entry point: read the HPC and IO CSV directories, aggregate
    # each into fixed-size instances, join them side by side and write a CSV.
    if len(sys.argv) != 4:
        print("Usage: ", sys.argv[0], " <hpc_csvs_path> <io_csvs_path> <output_csv_file_name>")
        print("Ex: ", sys.argv[0], " HPC_csvs_directory IO_csvs_directory aggregate_data.csv")
        sys.exit(1)
    hpc_path, io_path, out_file = sys.argv[1:4]

    if not os.path.isdir(hpc_path):
        print("HPCs directory doesn't exist.")
        sys.exit(1)
    if not os.path.isdir(io_path):
        print("IOs directory doesn't exist.")
        sys.exit(1)

    df_hpc = readCSVs(hpc_path, "HPC")
    df_io = readCSVs(io_path, "IO")
    # HPC rows are grouped 40 at a time and IO rows 10 at a time.
    df_agg_hpc = aggregate_metrics(df_hpc, 40, "HPC")
    df_agg_io = aggregate_metrics(df_io, 10, "IO")
    # The two aggregates are joined column-wise on their row index; the label
    # columns appear in both, so the second copy of each is dropped.
    df = pd.concat([df_agg_hpc, df_agg_io], axis=1)
    df = df.loc[:, ~df.columns.duplicated()]
    print(df.columns, df.shape)
    print(df.SampleApp.unique(), df.Round.unique())

    # Honour the overwrite prompt: an existing file is only replaced when the
    # user explicitly answers "y".
    if os.path.isfile(out_file):
        answer = input("The file already exists. Do you want me to overwrite? y/n: ")
        if answer.strip().lower() != "y":
            print("Output file left unchanged.")
            sys.exit(0)
    df.to_csv(out_file, index=False)
